import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import squarify
from scipy.spatial.distance import cdist
import seaborn as sns
plt.style.use('fivethirtyeight')
from sklearn.cluster import SpectralClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
excel_data_df1 = pd.read_excel('crimes.xlsx', sheet_name='Server_CH258-113')
df_crime = pd.DataFrame(excel_data_df1)
excel_data_df2 = pd.read_excel('cities.xlsx', sheet_name='2018')
df_cities = pd.DataFrame(excel_data_df2)
excel_data_df3 = pd.read_excel('mapping.xlsx', sheet_name='דת היישוב')
df_religion = pd.DataFrame(excel_data_df3)
excel_data_df4 = pd.read_excel('mapping.xlsx', sheet_name='מחוז ונפה')
df_districts = pd.DataFrame(excel_data_df4)
excel_data_df5 = pd.read_excel('mapping.xlsx', sheet_name='מעמד מוניציפלי')
df_municipal_status = pd.DataFrame(excel_data_df5)
excel_data_df6 = pd.read_excel('mapping.xlsx', sheet_name='אזור טבעי ')
df_natural_area = pd.DataFrame(excel_data_df6)
excel_data_df7 = pd.read_excel('mapping.xlsx', sheet_name='שיוך מטרופוליני')
df_metropolin_affiliation = pd.DataFrame(excel_data_df7)
excel_data_df8 = pd.read_excel('mapping.xlsx', sheet_name='צורת יישוב')
df_shape = pd.DataFrame(excel_data_df8)
excel_data_df9 = pd.read_excel('mapping.xlsx', sheet_name='השתייכות אירגונית')
df_organizational_affiliation = pd.DataFrame(excel_data_df9)
excel_data_df3 = pd.read_excel('mapping.xlsx', sheet_name='תחנות משטרה')
df_police = pd.DataFrame(excel_data_df3)
excel_data_df3 = pd.read_excel('mapping.xlsx', sheet_name='אשכול רשויות מקומי')
df_local_authority_cluster = pd.DataFrame(excel_data_df3)
df_cities.columns = ['hebrew_name','symbol','transliteration','district',
'subdistrict','natural_zone','Municipal status','Metropolitan affiliation'
,'religion','population_2018','jews & others', 'jews out of' ,
'arab', 'year of founding', 'shape', 'Organizational affiliation' , 'coordinates',
'height', 'Planning Committee' , 'police space' , 'year', 'english_name', 'Local authority cluster']
# Check if the information we have is full or missing values
# df_cities:
l = []
for col in df_cities.columns[2:]:
l.append([col, df_cities[col].isnull().sum()])
number_of_nulls = pd.DataFrame(l, columns = ["Column", "Number of NaN values"])
del l
number_of_nulls
| Column | Number of NaN values | |
|---|---|---|
| 0 | transliteration | 221 |
| 1 | district | 0 |
| 2 | subdistrict | 0 |
| 3 | natural_zone | 148 |
| 4 | Municipal status | 77 |
| 5 | Metropolitan affiliation | 929 |
| 6 | religion | 236 |
| 7 | population_2018 | 264 |
| 8 | jews & others | 295 |
| 9 | jews out of | 321 |
| 10 | arab | 938 |
| 11 | year of founding | 393 |
| 12 | shape | 0 |
| 13 | Organizational affiliation | 671 |
| 14 | coordinates | 32 |
| 15 | height | 196 |
| 16 | Planning Committee | 166 |
| 17 | police space | 164 |
| 18 | year | 0 |
| 19 | english_name | 221 |
| 20 | Local authority cluster | 765 |
# showing the null values in the train data
sns.heatmap(df_cities.isnull(),yticklabels=False,cbar=False,cmap='cividis')
cols = list(df_cities.columns)
print('train data:')
for c in cols:
print(c, ': ' , df_cities[c].isna().sum() ,' from :', df_cities['symbol'].count() )
train data: hebrew_name : 0 from : 1482 symbol : 0 from : 1482 transliteration : 221 from : 1482 district : 0 from : 1482 subdistrict : 0 from : 1482 natural_zone : 148 from : 1482 Municipal status : 77 from : 1482 Metropolitan affiliation : 929 from : 1482 religion : 236 from : 1482 population_2018 : 264 from : 1482 jews & others : 295 from : 1482 jews out of : 321 from : 1482 arab : 938 from : 1482 year of founding : 393 from : 1482 shape : 0 from : 1482 Organizational affiliation : 671 from : 1482 coordinates : 32 from : 1482 height : 196 from : 1482 Planning Committee : 166 from : 1482 police space : 164 from : 1482 year : 0 from : 1482 english_name : 221 from : 1482 Local authority cluster : 765 from : 1482
We will fill the missing population data by the shape of each settelement. Each shape describes the aproximate type and size of each settelment. We replaced the values by the median of that value
We will fill the missing religion data by the same logic.
cities=df_cities.copy()
cities['population_2018'] = cities['population_2018'].fillna(0)
for index, row in cities.iterrows():
if row['population_2018'] == 0.0 :
if row['shape'] == 120:
cities.at[index,'population_2018']= 500000
elif row['shape'] == 130:
cities.at[index,'population_2018']= 300000
elif row['shape'] == 140:
cities.at[index,'population_2018']= 150000
elif row['shape'] == 150:
cities.at[index,'population_2018']= 75000
elif row['shape'] == 160 or row['shape'] == 260 :
cities.at[index,'population_2018']= 25000
elif row['shape'] == 250:
cities.at[index,'population_2018']= 75000
elif row['shape'] == 170 or row['shape'] == 270:
cities.at[index,'population_2018']= 15000
elif row['shape'] == 180 or row['shape'] == 280:
cities.at[index,'population_2018']= 7000
elif row['shape'] == 190 or row['shape'] == 191 or row['shape'] == 192 or row['shape'] == 193 or row['shape'] == 290 :
cities.at[index,'population_2018']= 2500
else:
cities.at[index,'population_2018']= 2500
#religion filling:
cities['religion'] = cities['religion'].fillna(0)
for index, row in cities.iterrows():
if row['religion'] == 0.0 :
if row['shape'] == 120 or row['shape'] == 130 or row['shape'] == 140 or row['shape'] == 150 or row['shape'] == 180 or row['shape'] == 190 or row['shape'] == 340:
cities.at[index,'religion']= 1
elif row['shape'] == 250 or row['shape'] == 160 or row['shape'] == 260 or row['shape'] == 270 or row['shape'] == 280 or row['shape'] == 290 or row['shape'] == 440:
cities.at[index,'religion']= 2
elif row['shape'] == 460:
cities.at[index,'religion']= 3
we will fill the missing nationallity values using the shape to find which type of settelement that is (jews, arab or other): For jews settelments we will fill 90% jews and others , 10% arab. For jews out of we will use 90% out of "jews & others" . For arab settelments we will fill 90% of population and 10% jews.
cities['jews & others'] = cities['jews & others'].fillna(0)
cities['arab'] = cities['arab'].fillna(0)
cities['jews out of'] = cities['jews out of'].fillna(0)
for index, row in cities.iterrows():
if row['jews & others'] == 0.0 :
if row['shape'] in(120,130,140,150,160,170,180,190,191,192,193):
cities.at[index,'jews & others']= round(row['population_2018']*0.9)
else:
cities.at[index,'jews & others']= round(row['population_2018']*0.1) # we assume the population would be very small
if row['arab'] == 0.0 :
if row['shape'] in(250,260,270,280,290):
cities.at[index,'arab']= round(row['population_2018']*0.9)
else:
cities.at[index,'arab']= round(row['population_2018']*0.1) # we assume the population would be very small
if row['jews out of'] == 0.0 :
cities.at[index,'jews out of']= round(row['jews & others']*0.9)
#fill height with 0 :
cities["height"].fillna("0",inplace = True)
cities["Municipal status"].fillna("-",inplace = True)
cities["Organizational affiliation"].fillna("19",inplace = True)
#columns we see no use for in futher analysis
cities = cities.drop(['year of founding','year',"transliteration" ], 1)
cities['police space'] = cities['police space'].fillna(0)
for index, row in cities.iterrows():
if row['police space'] == 0.0 :
if row['subdistrict'] in(11,72,73,74,75,76,77) : #jerusalem district
cities.at[index,'police space']= random.choice((10002489,10002521,10002475,10002507,15000597,10002443,15000706,15000705,10002556))
elif row['subdistrict'] == 21:
cities.at[index,'police space']= 10004528
elif row['subdistrict'] == 22:
cities.at[index,'police space']= 10004552
elif row['subdistrict'] == 23:
cities.at[index,'police space']= 10004394
elif row['subdistrict'] == 24:
cities.at[index,'police space']=10004315
elif row['subdistrict'] == 25:
cities.at[index,'police space']=10004426
elif row['subdistrict'] == 29:
cities.at[index,'police space']=10004579
elif row['subdistrict'] == 31:
cities.at[index,'police space']=10004171
elif row['subdistrict'] == 32:
cities.at[index,'police space']=10004261
elif row['subdistrict'] == 41: #sharon close to natanya
cities.at[index,'police space']=10002089
elif row['subdistrict'] == 42:
cities.at[index,'police space']=10002116
elif row['subdistrict'] == 43:
cities.at[index,'police space']=10002228
elif row['subdistrict'] == 44:
cities.at[index,'police space']=10002241
elif row['subdistrict'] in (51, 52, 53):
cities.at[index,'police space']= random.choice((10001695,15002211,10003709,10001473,10001516,10001533,10001627,10001605,10001579,10001430))
elif row['subdistrict'] == 61:
cities.at[index,'police space']=10001805
elif row['subdistrict'] == 62:
cities.at[index,'police space']=10001900
elif row['subdistrict'] == 71:#genin close to afula
cities.at[index,'police space']=10004394
#dropping columns with more than half NaN values
cities=cities.drop(['Metropolitan affiliation', 'Local authority cluster'], axis=1)
# showing the null values in the train data
sns.heatmap(cities.isnull(),yticklabels=False,cbar=False,cmap='cividis')
cols = list(cities.columns)
print('train data:')
for c in cols:
print(c, ': ' , df_cities[c].isna().sum() ,' from :', cities['symbol'].count() )
train data: hebrew_name : 0 from : 1482 symbol : 0 from : 1482 district : 0 from : 1482 subdistrict : 0 from : 1482 natural_zone : 148 from : 1482 Municipal status : 77 from : 1482 religion : 236 from : 1482 population_2018 : 264 from : 1482 jews & others : 295 from : 1482 jews out of : 321 from : 1482 arab : 938 from : 1482 shape : 0 from : 1482 Organizational affiliation : 671 from : 1482 coordinates : 32 from : 1482 height : 196 from : 1482 Planning Committee : 166 from : 1482 police space : 164 from : 1482 english_name : 221 from : 1482
#we chose to remove 'sasa' for it's collision with 'savyon' symbol:
cities = cities.loc[cities["hebrew_name"] != 'סאסא']
cities.head(2)
| hebrew_name | symbol | district | subdistrict | natural_zone | Municipal status | religion | population_2018 | jews & others | jews out of | arab | shape | Organizational affiliation | coordinates | height | Planning Committee | police space | english_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | אבו ג'ווייעד (שבט) | 967 | 6 | 62 | 623.0 | - | 3.0 | 2500.0 | 250.0 | 0.0 | 250.0 | 460 | 19 | 2.040057e+09 | 0 | 699.0 | 15003711.0 | Abu Juway'ad |
| 1 | אבו גוש | 472 | 1 | 11 | 111.0 | 99 | 2.0 | 7543.0 | 97.0 | 79.0 | 7446.0 | 280 | 19 | 2.105263e+09 | 598 | 152.0 | 10002475.0 | Abu Ghosh |
# df_religion:
df_religion.columns=[ "religion","symbol"]
# dropping 2 unnececery rows:
df_religion = df_religion.drop(labels=[0, 1], axis=0)
# df_metropolin_affiliation:
df_metropolin_affiliation.columns = ['district','ring','name_metro','symbol_mt']
#we dont find use to it :
df_metropolin_affiliation = df_metropolin_affiliation.drop('ring', 1)
# dropping 2 unnececery rows:
df_metropolin_affiliation = df_metropolin_affiliation.drop(labels=[0, 1], axis=0)
#Filling all NaN values with correct values:
df_metropolin_affiliation['name_metro'] = df_metropolin_affiliation['name_metro'].fillna(0)
val = ''
for index, row in df_metropolin_affiliation.iterrows():
if row['name_metro'] != 0 :
val = row['name_metro']
else:
row['name_metro'] =val
# df_districts:
df_districts.columns = ['sub_napa','napa','district','symbol']
# dropping 2 unnececery rows:
df_districts = df_districts.drop(labels=[0, 1, 27, 28, 29 ,30], axis=0)
#Filling all NaN values with correct values:
df_districts['district'] = df_districts['district'].fillna(0)
val = ''
for index, row in df_districts.iterrows():
if row['district'] != 0 :
val = row['district']
else:
row['district'] =val
# df_municipal_status:
df_municipal_status.columns = ['monicipal_status','symbol']
#Drop NaN values:
df_municipal_status.dropna()
# dropping 2 unnececery rows:
df_municipal_status = df_municipal_status.drop(labels=[0, 1 , 10], axis=0)
# df_natural_area:
df_natural_area.columns = ['drop1', 'drop2' , 'name_natoral_area' , 'napa','district', 'symbol']
df_natural_area = df_natural_area.drop(labels=["drop1", "drop2"], axis=1)
df_natural_area = df_natural_area.drop(labels=[0, 1], axis=0)
#Filling all NaN values with correct values:
df_natural_area['district'] = df_natural_area['district'].fillna(0)
val = ''
for index, row in df_natural_area.iterrows():
if row['district'] != 0 :
val = row['district']
else:
row['district'] =val
#Filling all NaN values with correct values:
df_natural_area['napa'] = df_natural_area['napa'].fillna(0)
val = 'נפת ירושליים'
for index, row in df_natural_area.iterrows():
if row['napa'] != 0 :
val = row['napa']
else:
row['napa'] =val
# df_shape:
df_shape.columns = [ 'shape','type', 'symbol']
df_shape = df_shape.drop(labels=[0, 1], axis=0)
#Filling all NaN values with correct values:
df_shape['type'] = df_shape['type'].fillna(0)
val = ''
for index, row in df_shape.iterrows():
if row['type'] != 0 :
val = row['type']
else:
row['type'] =val
#Drop NaN values:
df_shape = df_shape.drop(labels=[2,19, ], axis=0)
# df_organizational_affiliation:
df_organizational_affiliation.columns = [ 'orginaztion_affliction','symbol']
df_organizational_affiliation = df_organizational_affiliation.drop(labels=[0, 1,2], axis=0)
# df_police:
df_police.columns = [ 'symbol','station','space', 'district']
df_police = df_police.drop(labels=[0, 1,2], axis=0)
#Filling all NaN values with correct values:
df_police['space'] = df_police['space'].fillna(0)
val = ''
for index, row in df_police.iterrows():
if row['space'] != 0 :
val = row['space']
else:
row['space'] =val
#Filling all NaN values with correct values:
df_police['district'] = df_police['district'].fillna(0)
val = ''
for index, row in df_police.iterrows():
if row['district'] != 0 :
val = row['district']
else:
row['district'] =val
# df_local_authority_cluster:
df_local_authority_cluster.columns = [ 'cluster_name', 'cluster_symbol']
df_local_authority_cluster = df_local_authority_cluster.drop(labels=[0, 1,2], axis=0)
df_crime.columns = ['symbol','location','desc_static_group','year_messege','Total_crimes_all_cities_4_years','2019','2018','2017','2016','2015','2014']
#change all '-' values to zero:
df_crime = df_crime.replace(['-'], 0 )
df_crime.head(2)
| symbol | location | desc_static_group | year_messege | Total_crimes_all_cities_4_years | 2019 | 2018 | 2017 | 2016 | 2015 | 2014 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | Total | NaN | NaN | 1973220 | 301142 | 320713 | 329265 | 328681 | 339804 | 353615 |
| 1 | 472.0 | אבו גוש | Total | NaN | 1997 | 338 | 284 | 310 | 340 | 323 | 402 |
# Check if the information we have is full or missing values
l = []
for col in df_crime.columns[2:]:
l.append([col, df_crime[col].isnull().sum()])
number_of_nulls = pd.DataFrame(l, columns = ["Column", "Number of NaN values"])
del l
number_of_nulls
# showing the null values in the train data
sns.heatmap(df_crime.isnull(),yticklabels=False,cbar=False,cmap='cividis')
cols = list(df_crime.columns)
print('train data:')
for c in cols:
print(c, ': ' , df_crime[c].isna().sum() ,' from :', df_crime['symbol'].count() )
train data: symbol : 1 from : 3682 location : 0 from : 3682 desc_static_group : 1 from : 3682 year_messege : 3683 from : 3682 Total_crimes_all_cities_4_years : 0 from : 3682 2019 : 0 from : 3682 2018 : 0 from : 3682 2017 : 0 from : 3682 2016 : 0 from : 3682 2015 : 0 from : 3682 2014 : 0 from : 3682
We chose to delete the 'year messege' column for it's uninformative characturisticts .
# unnacacery column, we will drop it:
df_crime = df_crime.drop('year_messege', 1)
#remove unknown offence:
df_crime = df_crime[df_crime.desc_static_group != 0]
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות בטחון'], 'security_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות כלכליות'], 'financial_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות כלפי המוסר'], 'moral_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות כלפי הרכוש'], 'property_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות מין'], 'sexual_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות מרמה'], 'fraud_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות נגד אדם'], 'human_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות מנהליות'], 'managment_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות נגד גוף'], 'body_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות סדר ציבורי'], 'public_order_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות רשוי'], 'licensing_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['עבירות תנועה'], 'traffic_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['שאר עבירות'], 'other_offences')
df_crime['desc_static_group'] = df_crime['desc_static_group'].replace(['סעיפי הגדרה'], 'definition_section_offences')
#merging demografic data with crime data using merge function- merges using 'symbol' identical rows
crimes_cities = pd.merge(df_crime, cities)
crimes_cities.head(2)
| symbol | location | desc_static_group | Total_crimes_all_cities_4_years | 2019 | 2018 | 2017 | 2016 | 2015 | 2014 | ... | jews & others | jews out of | arab | shape | Organizational affiliation | coordinates | height | Planning Committee | police space | english_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 472.0 | אבו גוש | Total | 1997 | 338 | 284 | 310 | 340 | 323 | 402 | ... | 97.0 | 79.0 | 7446.0 | 280 | 19 | 2.105263e+09 | 598 | 152.0 | 10002475.0 | Abu Ghosh |
| 1 | 472.0 | אבו גוש | security_offences | 55 | 7 | 9 | 8 | 6 | 15 | 10 | ... | 97.0 | 79.0 | 7446.0 | 280 | 19 | 2.105263e+09 | 598 | 152.0 | 10002475.0 | Abu Ghosh |
2 rows × 27 columns
#dataframes for each offence separetlly:
df_security_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'security_offences']
df_financial_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'financial_offences']
df_moral_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'moral_offences']
df_property_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'property_offences']
df_sexual_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'sexual_offences']
df_fraud_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'fraud_offences']
df_human_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'human_offences']
df_body_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'body_offences']
df_public_order_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'public_order_offences']
df_licensing_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'licensing_offences']
df_traffic_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'traffic_offences']
df_other_offences = crimes_cities.loc[crimes_cities.desc_static_group == 'other_offences']
security_offences_sum= df_security_offences.Total_crimes_all_cities_4_years.sum()
financial_offences_sum= df_financial_offences.Total_crimes_all_cities_4_years.sum()
moral_offences_sum= df_moral_offences.Total_crimes_all_cities_4_years.sum()
property_offences_sum= df_property_offences.Total_crimes_all_cities_4_years.sum()
sexual_offences_sum= df_sexual_offences.Total_crimes_all_cities_4_years.sum()
fraud_offences_sum= df_fraud_offences.Total_crimes_all_cities_4_years.sum()
human_offences_sum= df_human_offences.Total_crimes_all_cities_4_years.sum()
body_offences_sum= df_body_offences.Total_crimes_all_cities_4_years.sum()
public_order_offences_sum= df_public_order_offences.Total_crimes_all_cities_4_years.sum()
licensing_offences_sum= df_licensing_offences.Total_crimes_all_cities_4_years.sum()
traffic_offences_sum= df_traffic_offences.Total_crimes_all_cities_4_years.sum()
other_offences_sum= df_other_offences.Total_crimes_all_cities_4_years.sum()
data = [['Security', security_offences_sum],
['Morality', moral_offences_sum],['Property', property_offences_sum],
['Financial', financial_offences_sum],
['Sexual', sexual_offences_sum],['Fraud', fraud_offences_sum],
['Human', human_offences_sum],
['Body', body_offences_sum],['Public Order', public_order_offences_sum],
['Licensing', licensing_offences_sum],
['Traffic', traffic_offences_sum], ['Other', other_offences_sum]]
# Create the pandas DataFrame
df_total_per_offence = pd.DataFrame(data, columns = ['Offence', 'Total'])
#Data frame for visuallizing per offence- the number of crimes committed:
df_total_per_offence
| Offence | Total | |
|---|---|---|
| 0 | Security | 44480 |
| 1 | Morality | 169149 |
| 2 | Property | 740727 |
| 3 | Financial | 4396 |
| 4 | Sexual | 32106 |
| 5 | Fraud | 92902 |
| 6 | Human | 3514 |
| 7 | Body | 321568 |
| 8 | Public Order | 644085 |
| 9 | Licensing | 7444 |
| 10 | Traffic | 7209 |
| 11 | Other | 769 |
# install extra package for next plot:
# conda install -c conda-forge squarify
# plotting a tree map
plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fivethirtyeight')
color = plt.cm.magma(np.linspace(0, 1, 15))
squarify.plot(sizes = df_total_per_offence['Total'], label = df_total_per_offence['Offence'], alpha=.8, color = color)
plt.title('Crimes in israel over 2014-2019', fontsize = 20)
plt.axis('off')
plt.show()
Contains total crimes pre persettelment:
total_crimes_per_city = crimes_cities[crimes_cities.desc_static_group == 'Total']
crimes_TOTAL_cities = pd.merge(total_crimes_per_city, cities)
crimes_TOTAL_cities =crimes_TOTAL_cities.drop(['desc_static_group' ] , 1)
crimes_TOTAL_cities.head(2)
| symbol | location | Total_crimes_all_cities_4_years | 2019 | 2018 | 2017 | 2016 | 2015 | 2014 | hebrew_name | ... | jews & others | jews out of | arab | shape | Organizational affiliation | coordinates | height | Planning Committee | police space | english_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 472.0 | אבו גוש | 1997 | 338 | 284 | 310 | 340 | 323 | 402 | אבו גוש | ... | 97.0 | 79.0 | 7446.0 | 280 | 19 | 2.105263e+09 | 598 | 152.0 | 10002475.0 | Abu Ghosh |
| 1 | 473.0 | אבו סנאן | 1699 | 288 | 260 | 301 | 272 | 291 | 287 | אבו סנאן | ... | 28.0 | 11.0 | 13887.0 | 270 | 19 | 2.160776e+09 | 19 | 252.0 | 10004315.0 | Abu Sinan |
2 rows × 26 columns
df_corrolation= crimes_TOTAL_cities.drop(['english_name','2019','2018', '2017','2016','2015','2014','hebrew_name','location'],1)
# crimes_NO_TOTAL_cities = df_crime[df_crime.desc_static_group != 'Total']
# crimes_NO_TOTAL_cities = pd.merge(crimes_NO_TOTAL_cities, cities)
From the corolation matrix we can inffer that the bigger the population - the larger the number of crimes committed. We can also see that this corollation is very strong when it comes to jews and other population. in contrast to the arab population - where a wicker connection between total crime and arab population. We can also see that the districts, subdistricts are highly corelated to the planning committee. We can infer for later work, that not all of the features are needed.
corrMatrix = df_corrolation.corr()
corrMatrix=corrMatrix.replace(np.nan, 0, regex=True)
sns.heatmap(corrMatrix, annot=True)
plt.rcParams["figure.figsize"] = [14, 14]
plt.show()
# conda install -c plotly plotly
From the Scatter matrix of features we can see again that the corolation between the population size and the total crimes committed. we can now see the our previous assumption that in jews and other communities the corolation is greater than the corolattion between the arab comunnity and total crimes commited. From this matrix we can see that the differences were merley becuse arab population in general are smaller than the jews and other communities, and the reason for the differences were not made from the type of population, but merly from it's size
# ''' Correlation Matrix '''
# import plotly.express as px
# fig = px.scatter_matrix(crimes_TOTAL_cities, color="Total_crimes_all_cities_4_years", title="Scatter matrix of features",width=950,
# height=1000, dimensions=['arab', 'shape','jews & others' , 'district', 'Local authority cluster','religion'])
# fig.show()
''' Correlation Matrix '''
import plotly.express as px
fig = px.scatter_matrix(crimes_TOTAL_cities, color="Total_crimes_all_cities_4_years", title="Scatter matrix of features",width=950,
height=1000, dimensions=['arab', 'shape','jews & others', 'jews out of', 'Planning Committee','religion'])
fig.show()
Here we see the top 15 rated settelmenets in crime presentage over 2014-2019:
# Plot "Settlements with high offences rates in Israel"
to_plot= crimes_TOTAL_cities.copy()
to_plot["crime precentage"] =crimes_TOTAL_cities["Total_crimes_all_cities_4_years"]/crimes_TOTAL_cities["population_2018"]
df = to_plot.sort_values('crime precentage', ascending=False)
df['percent'] = round(df['crime precentage'] * 100, 2)
# crimes_TOTAL_cities['english_name'] = crimes_TOTAL_cities['english_name']
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
sns.barplot(y='english_name', x='crime precentage', data=df.head(15), palette=sns.color_palette("Set2", 4))
plt.title('Settlements with high offences rates in Israel (top-15)')
plt.ylabel('Settlements')
plt.xlabel('% of crimes')
plt.show()
Here we see the crime presentage in district. We added error bars thst shoes the variance in each district.
df = to_plot.sort_values('crime precentage', ascending=False).head(15)
df["district_name"] = ""
df['percent'] = round(df['crime precentage'] * 100, 2)
#function to set for each district code- its name:
def name_districts(df):
for index,row in df.iterrows():
if int(row['district']) == 1:
df.at[index,'district_name'] = 'Jerusalem '
elif int(row['district']) == 2:
df.at[index,'district_name'] = 'North'
elif int(row['district']) == 3 :
df.at[index,'district_name'] = 'Haifa'
elif int(row['district']) == 4:
df.at[index,'district_name'] = 'Center'
elif int(row['district']) == 5:
df.at[index,'district_name'] = 'Tel Aviv'
elif int(row['district']) == 6:
df.at[index,'district_name'] = 'South'
elif int(row['district']) == 7:
df.at[index,'district_name'] = 'Yehuda & Shomron'
return df
df=name_districts(df)
plt.figure(figsize = (15, 7))
plt.suptitle('Crime presentage for districts:', fontsize=20)
sns.barplot(y ="district_name", x = "percent", data = df, palette="cool")
<AxesSubplot:xlabel='percent', ylabel='district_name'>
The research question we'll be trying to answer in this section is: The corollation between a settelment size and crime presentage. In other words we expect mixed nationallities settelments to be higher on crime presentage. To do that we'll fit the data of crime using two clustering methods, after we have K clusters we'll view their districts and infer if we indeed found a correlation or not.
# crimes_cities
# ge= crimes_TOTAL_cities[['jews & others','arab','Total_crimes_all_cities_4_years']]
# ge
# fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.03, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
# fig.add_trace(
# go.Scatter( x=ge['jews & others'], y=ge['Total_crimes_all_cities_4_years'],marker_symbol='hexagon2', mode="markers+text",
# marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
# row=1, col=1)
# fig.add_trace(
# go.Scatter3d(x=ge['jews & others'], y=ge['arab'], z=ge['Total_crimes_all_cities_4_years'],
# mode="markers",marker_symbol='circle-open', marker=dict(size=7)),
# row=1, col=2)
# fig.update_xaxes(title_text="x", row=1, col=1)
# fig.update_yaxes(title_text="y", row=1, col=1)
# fig.update_layout(height=600, width=1000,
# title_text="Plotting data after lowering the it's dimention, once in 2D the other 3D", showlegend=False)
# fig.show()
# ge["crime_precentage"] =crimes_TOTAL_cities["Total_crimes_all_cities_4_years"]/crimes_TOTAL_cities["population_2018"]
# summ = to_plot.sum()
# ge["mix"] =(crimes_TOTAL_cities["jews & others"]/summ['population_2018'] )*100
# clustering = ge[['crime_precentage', 'mix']]
# clustering
to_plot["crime_precentage"] =crimes_TOTAL_cities["Total_crimes_all_cities_4_years"]/crimes_TOTAL_cities["population_2018"]
summ = to_plot.sum()
to_plot["settelment_size"] =crimes_TOTAL_cities["population_2018"]/summ['population_2018']
df_clustering = to_plot[['crime_precentage', 'settelment_size']]
df_clustering
| crime_precentage | settelment_size | |
|---|---|---|
| 0 | 0.264749 | 0.000946 |
| 1 | 0.122098 | 0.001745 |
| 2 | 0.166715 | 0.001718 |
| 3 | 0.141350 | 0.006921 |
| 4 | 0.198856 | 0.003640 |
| ... | ... | ... |
| 195 | 0.150267 | 0.000871 |
| 196 | 0.228356 | 0.005213 |
| 197 | 0.431099 | 0.056634 |
| 198 | 0.220339 | 0.001613 |
| 199 | 0.201891 | 0.002534 |
200 rows × 2 columns
# Let's see the distrebution of each element, to decide what can be considered as an outliner,and thus- droppe
sns.displot(df_clustering, x="crime_precentage")
sns.displot(df_clustering, x="settelment_size")
<seaborn.axisgrid.FacetGrid at 0x7fb4000d9460>
# # Normallizing the data :
# df_clustering = df_clustering[df_clustering['crime_precentage'] < 0.4]
# df_clustering = df_clustering[df_clustering['settelment_size'] < 0.03]
df_clustering
| crime_precentage | settelment_size | |
|---|---|---|
| 0 | 0.264749 | 0.000946 |
| 1 | 0.122098 | 0.001745 |
| 2 | 0.166715 | 0.001718 |
| 3 | 0.141350 | 0.006921 |
| 4 | 0.198856 | 0.003640 |
| ... | ... | ... |
| 195 | 0.150267 | 0.000871 |
| 196 | 0.228356 | 0.005213 |
| 197 | 0.431099 | 0.056634 |
| 198 | 0.220339 | 0.001613 |
| 199 | 0.201891 | 0.002534 |
200 rows × 2 columns
We would like to see the connection berween the settelment size and its crime presentage. First we will prepare the data and visualize it, to see what clustering algorithm we think is prefered.
import plotly
import plotly.graph_objects as go
# Plot the data on a graph to see the nodes before and after clustering.
fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.003, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
fig.add_trace(
go.Scatter( x=df_clustering['crime_precentage'], y=df_clustering['settelment_size'],marker_symbol='hexagon2', mode="markers+text",
marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
row=1, col=1)
fig.update_xaxes(title_text="Crime presentage", row=1, col=1)
fig.update_yaxes(title_text="Settelment size", row=1, col=1)
fig.update_layout(height=600, width=1000,
title_text="Visualizing the data in 2D graph", showlegend=False)
fig.show()
From the graph we cannot conclude a clear connection, therfore we chose KMeans algorithm to first see what the data means and than, if nedded choose the second algorithm more wisley.
we use The Elbow Method that shows the optimal k for the algorethim which means the number of the clusters that we will get, when we display the 'Sum of squared error' .acording to each K and the connecting between the points we will get and elbow shape the closest k to the elbow it will be the optimal k
distortions = []
K = range(1, 10)
for k in K:
kmeanModel = KMeans(n_clusters=k, max_iter=10000).fit(df_clustering)
#kmeanModel.fit(only_parties)
distortions.append(sum(np.min(cdist(df_clustering, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / df_clustering.shape[0])
plt.figure(figsize=(11, 5))
plt.plot(K, distortions, 'o-', markersize=12)
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.title('Optimal number of clusters')
plt.show()
kmeans = KMeans(n_clusters=5).fit(df_clustering)
centroids = kmeans.cluster_centers_
lables= kmeans.labels_
print(centroids)
plt.scatter(df_clustering["crime_precentage"], df_clustering["settelment_size"], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()
[[0.07869535 0.00231837] [0.20271809 0.00721146] [0.29506994 0.00670089] [0.14109607 0.00285011] [0.4890435 0.01634561]]
result=kmeans.labels_
me=pd.DataFrame(result)
me.columns =['point']
together= pd.concat([me, to_plot], axis=1)
together
| point | symbol | location | Total_crimes_all_cities_4_years | 2019 | 2018 | 2017 | 2016 | 2015 | 2014 | ... | shape | Organizational affiliation | coordinates | height | Planning Committee | police space | english_name | crime precentage | crime_precentage | settelment_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 472.0 | אבו גוש | 1997 | 338 | 284 | 310 | 340 | 323 | 402 | ... | 280 | 19 | 2.105263e+09 | 598 | 152.0 | 10002475.0 | Abu Ghosh | 0.264749 | 0.264749 | 0.000946 |
| 1 | 3 | 473.0 | אבו סנאן | 1699 | 288 | 260 | 301 | 272 | 291 | 287 | ... | 270 | 19 | 2.160776e+09 | 19 | 252.0 | 10004315.0 | Abu Sinan | 0.122098 | 0.122098 | 0.001745 |
| 2 | 3 | 182.0 | אבן יהודה | 2284 | 279 | 285 | 380 | 407 | 458 | 475 | ... | 170 | 19 | 1.894469e+09 | 10 | 457.0 | 15000060.0 | Even Yehuda | 0.166715 | 0.166715 | 0.001718 |
| 3 | 3 | 2710.0 | אום אל פחם | 7800 | 1459 | 1506 | 1160 | 1015 | 1308 | 1352 | ... | 250 | 19 | 2.145171e+09 | 139 | 354.0 | 10004249.0 | Umm al-Fahm | 0.141350 | 0.141350 | 0.006921 |
| 4 | 1 | 31.0 | אופקים | 5771 | 906 | 1066 | 979 | 949 | 959 | 912 | ... | 160 | 19 | 1.638258e+09 | 113 | 601.0 | 10001893.0 | Ofaqim | 0.198856 | 0.198856 | 0.003640 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195 | 3 | 538.0 | שעב | 1043 | 167 | 150 | 174 | 183 | 183 | 186 | ... | 280 | 19 | 2.226375e+09 | 20 | 263.0 | 15002411.0 | Sha'ab | 0.150267 | 0.150267 | 0.000871 |
| 196 | 1 | 8800.0 | שפרעם | 9490 | 1536 | 1851 | 1871 | 1500 | 1383 | 1349 | ... | 260 | 19 | 2.164075e+09 | 38 | 261.0 | 10004497.0 | Shefar'am | 0.228356 | 0.228356 | 0.005213 |
| 197 | 4 | 5000.0 | תל אביב יפו | 194651 | 29589 | 30267 | 30362 | 33033 | 34853 | 36547 | ... | 130 | 19 | 1.802766e+09 | -4 | 507.0 | 10001579.0 | Tel Aviv - Yafo | 0.431099 | 0.431099 | 0.056634 |
| 198 | 1 | 154.0 | תל מונד | 2834 | 298 | 466 | 427 | 504 | 601 | 538 | ... | 170 | 19 | 1.924068e+09 | 20 | 457.0 | 15000060.0 | Tel Mond | 0.220339 | 0.220339 | 0.001613 |
| 199 | 1 | 1054.0 | תל שבע | 4079 | 817 | 833 | 686 | 610 | 542 | 591 | ... | 260 | 19 | 1.867057e+09 | 280 | 652.0 | 10001937.0 | Tel Sheva | 0.201891 | 0.201891 | 0.002534 |
200 rows × 30 columns
li= kmeans.labels_
pd.DataFrame(li)
# gil.columns['cluster']
# together= pd.concat([gil, to_plot], axis=1)
# # together=pd.DataFrame.rename(together, columns={'0':'nx'})
# toget =together.groupby(by="cluster")
# # together[0]
# toget
li=pd.DataFrame(li)
li
| 0 | |
|---|---|
| 0 | 1 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 3 |
| ... | ... |
| 195 | 0 |
| 196 | 3 |
| 197 | 2 |
| 198 | 3 |
| 199 | 3 |
200 rows × 1 columns
We can see that most of the settelments, most of theri crimes are between 0%-10%. But the differnces is the disterbution. We can see that in larger settelments the disterbution in crime pesentege is larger, meaning the larger the settelment size is, the crime presentage disterbution is vary variant, i.e. more values (crime presentage). Werease in smaller settelments, the disrebution is smaller, and most of the small size setelments are mostly between 0%-10% crimes.
We chose GMM clustering from kmean conclusions. we saw that the differnces were in the data desterbution
to_GMM = np.column_stack((df_clustering['settelment_size'],df_clustering['crime_precentage'] ))
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4).fit(to_GMM)
labels = gmm.predict(to_GMM)
plt.scatter( to_GMM[:, 1],to_GMM[:, 0], c=labels, s=40, cmap='viridis');
The GMM algorithm supp ort our previous conclusion- lMost of the crimes are between 0%-10% regardless the settelments size (population wise). But- in larger settelment we can see a variant destreterbution in higer crime presentage.
Choosing features to work with: 1.From the corrolation matrix we saw a high corolation between 'district','subdistrict' and 'Planning Committee'. Therfore we will only choose 'Planning Committee'.
# take only numeric data fo random forest:
numeric_df= crimes_cities[['symbol','desc_static_group','Total_crimes_all_cities_4_years',
'2019','2018','2017','2016','2015','2014','religion', 'population_2018'
,'jews & others','arab' ,'jews out of', 'shape', 'Planning Committee']]
numeric_df=numeric_df[numeric_df['desc_static_group'] != 'Total']
#turning offences to numeric, for regression model:
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['security_offences'],1)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['financial_offences'],2)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['moral_offences'], 3)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['property_offences'],4)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['sexual_offences'], 5)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['fraud_offences'], 6)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['human_offences'], 7)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['managment_offences'], 8)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['body_offences'], 9)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['public_order_offences'], 10)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['licensing_offences'], 11)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['traffic_offences'], 12)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['other_offences'], 13)
numeric_df['desc_static_group'] = numeric_df['desc_static_group'].replace(['definition_section_offences'], 14)
numeric_df.head(2)
| symbol | desc_static_group | Total_crimes_all_cities_4_years | 2019 | 2018 | 2017 | 2016 | 2015 | 2014 | religion | population_2018 | jews & others | arab | jews out of | shape | Planning Committee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 472.0 | 1 | 55 | 7 | 9 | 8 | 6 | 15 | 10 | 2.0 | 7543.0 | 97.0 | 7446.0 | 79.0 | 280 | 152.0 |
| 2 | 472.0 | 2 | 3 | 1 | 0 | 0 | 0 | 2 | 0 | 2.0 | 7543.0 | 97.0 | 7446.0 | 79.0 | 280 | 152.0 |
#only relevent locations:
data_rand_forest_test = numeric_df[(numeric_df.symbol == 3780) | (numeric_df.symbol == 507) | (numeric_df.symbol==1139 )
| (numeric_df.symbol ==43 ) | (numeric_df.symbol==2640 ) | (numeric_df.symbol==7400 )]
data_rand_forest_train = numeric_df[(numeric_df.symbol != 3780) & (numeric_df.symbol != 507) &
(numeric_df.symbol!=1139 ) & (numeric_df.symbol!=43 ) &
(numeric_df.symbol!=2640 )& (numeric_df.symbol!=7400 )&
(numeric_df.desc_static_group!="לא ידוע" )]
#splittiong the data to training data and testing data:
#training for finding data about year 2018 as a training method:
X_train=data_rand_forest_train.drop(['2019'],1)
y_train=data_rand_forest_train["2019"]
#for testing: using year 2019:
X_test=data_rand_forest_test.drop(['2019'],1)
y_test=data_rand_forest_test["2019"]
#optimizing the parameters sent to randon forest algorithm:
param_grid = {
'criterion':['mse'],
'n_estimators': [150, 180, 200, 220], # The number of trees in the forest.
'max_depth': [None, 40, 60, 80], # The maximum depth of the tree.
'max_features': ['sqrt',4,5], # he number of features to consider when looking for the best split
'min_samples_split': [None, 3, 4], # The minimum number of samples required to split an internal node
'bootstrap': [True] # Whether bootstrap samples are used when building trees.
}
# Create a grid search object
gsRFC = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=-1, cv=5)
# Fit
gsRFC.fit(X_train, y_train)
gsRFC.best_params_
{'bootstrap': True,
'criterion': 'mse',
'max_depth': 40,
'max_features': 4,
'min_samples_split': 3,
'n_estimators': 200}
# run prediction using best estimetor found erlier (best hyperparameters using greed search:)
new_best_model = gsRFC.best_estimator_
ypred = new_best_model.predict(X_test)
# calulate MSE
#MSE :measures the average of the squares of the errors—that is
#the average squared difference between the estimated values and the actual value.
# The MSE is a measure of the quality of an estimator:
mse = mean_squared_error(y_test, ypred)
print('The model\'s MSE is ' + str(round(mse, 2)))
The model's MSE is 2683.99
# Create and get relevant data only
rf_graph = data_rand_forest_test.copy()
rf_graph['Pred'] = np.round(ypred)
rf_graph = rf_graph.rename(columns={'symbol': 'Settlement', '2019':'Test'})
rf_graph['MSE'] = (rf_graph['Pred']-rf_graph['Test'])**2
rf_graph['logMSE'] = np.round(np.log(rf_graph['MSE']), 2)
#Now we'll display the results in a table; actual number of legal votes, predicted number with MSE and log(MSE)
fig = go.Figure(data=[go.Table(
header=dict(values=['<b>Settlement</b>', '<b>Actual</b>', '<b>Predicted</b>', '<b>MSE</b>' ,'<b>log(MSE)</b>'],
line_color='darkslategray', fill_color='rgb(158,202,225)',
font=dict(size=13), align='center'),
cells=dict(values=[rf_graph['Settlement'], rf_graph['Test'], rf_graph['Pred'], rf_graph['MSE'] ,rf_graph['logMSE']],
line_color='darkslategray', fill_color = ['rgb(222,252,229)','white'],
align = 'center', font = dict(color = 'darkslategray', size = 13) ))
])
fig.update_layout(width=800, height=315, title='Actual number of each crimes per settelment vs. predicted number of crimes log(MSE)')
fig.show()
/Users/gylslmndr/opt/anaconda3/lib/python3.8/site-packages/pandas/core/series.py:726: RuntimeWarning: divide by zero encountered in log
to_graph= rf_graph.copy()
to_graph['total_predicted'] = to_graph['2018'] + to_graph['2017']+to_graph['2016'] + to_graph['2015']+ to_graph['2014']+ to_graph['Pred']
to_graph['tota_actual'] = to_graph['2018'] + to_graph['2017']+to_graph['2016'] + to_graph['2015']+ to_graph['2014']+ to_graph['Test']
to_graph.head(2)
| Settlement | desc_static_group | Total_crimes_all_cities_4_years | Test | 2018 | 2017 | 2016 | 2015 | 2014 | religion | ... | jews & others | arab | jews out of | shape | Planning Committee | Pred | MSE | logMSE | total_predicted | tota_actual | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 451 | 3780.0 | 14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1.0 | ... | 56744.0 | 2.0 | 56726.0 | 150 | 716.0 | 0.0 | 1.0 | 0.00 | 0.0 | 1 |
| 452 | 3780.0 | 1 | 162 | 33 | 39 | 24 | 27 | 23 | 16 | 1.0 | ... | 56744.0 | 2.0 | 56726.0 | 150 | 716.0 | 27.0 | 36.0 | 3.58 | 156.0 | 162 |
2 rows × 21 columns
#get second most common crime for each settelment
#Actual results:
maximum= -1
max_2= -1
log_mse_max= 50
log_mse_max2= 50
settelment= 3780
crime_max= 0
crime_second= 0
list_settelments= list()
list_second_crimes= list()
list_logMSE= list()
list_number= list()
for index, row in to_graph.iterrows():
if row['Settlement'] != settelment:
list_settelments.append(settelment)
list_second_crimes.append(crime_second)
list_number.append(max_2)
list_logMSE.append(log_mse_max2)
settelment = to_graph.at[index,'Settlement'] # update settelment
maximum= -1
max_2= -1
crime_max= 0
crime_second= 0
log_mse_max= 50
log_mse_max2= 50
if row['Test'] >= maximum :
max_2=maximum
maximum= to_graph.at[index,'Test']
crime_max = to_graph.at[index,'desc_static_group']
crime_second = crime_max
log_mse_max2=log_mse_max
log_mse_max=to_graph.at[index,'logMSE']
if row['Test'] >= max_2 :
max_2= to_graph.at[index,'Test']
crime_second = to_graph.at[index,'desc_static_group']
log_mse_max2=to_graph.at[index,'logMSE']
#get second most common crime for each settelment
#Predicted results:
maximum= -1
max_2= -1
settelment= 3780
crime_max= 0
crime_second= 0
log_mse_max= 50
log_mse_max2= 50
list_number_pred= list()
list_settelments_pred= list()
list_second_crimes_pred= list()
list_number_pred= list()
for index, row in to_graph.iterrows():
if row['Settlement'] != settelment:
list_settelments_pred.append(settelment)
list_second_crimes_pred.append(crime_second)
list_number_pred.append(max_2)
settelment = to_graph.at[index,'Settlement'] # update settelment
maximum= -1
max_2= -1
crime_max= 0
crime_second= 0
if row['Pred'] >= maximum :
max_2=maximum
maximum= to_graph.at[index,'Pred']
crime_max = to_graph.at[index,'desc_static_group']
crime_second = crime_max
if row['Pred'] >= max_2 :
max_2= to_graph.at[index,'Pred']
crime_second = to_graph.at[index,'desc_static_group']
data_plot = pd.DataFrame(list(zip(list_settelments_pred, list_second_crimes_pred ,list_second_crimes,list_number_pred,list_number,list_logMSE)),
columns =['Settlement', '2nd crime Actual','2nd crime Predicted' ,'Pred','Test', 'log(SEM)'])
data_plot
| Settlement | 2nd crime Actual | 2nd crime Predicted | Pred | Test | log(SEM) | |
|---|---|---|---|---|---|---|
| 0 | 3780.0 | 4 | 4 | 192.0 | 248 | 8.05 |
| 1 | 507.0 | 10 | 10 | 160.0 | 183 | 6.27 |
| 2 | 1139.0 | 10 | 10 | 509.0 | 563 | 7.98 |
| 3 | 43.0 | 3 | 3 | 11.0 | 16 | 3.22 |
| 4 | 7400.0 | 4 | 4 | 3778.0 | 3379 | 11.98 |
data_plot['Settlement'] = data_plot['Settlement'].replace([3780.0], 'Beitar- Elit')
data_plot['Settlement'] = data_plot['Settlement'].replace([507.0], 'Kfar Yasif')
data_plot['Settlement'] = data_plot['Settlement'].replace([1139.0], 'Karmiel')
data_plot['Settlement'] = data_plot['Settlement'].replace([43.0], 'Metulla')
data_plot['Settlement'] = data_plot['Settlement'].replace([7400.0], 'Natanya')
data_plot['2nd crime Actual'] = data_plot['2nd crime Actual'].replace([4], 'property_offences')
data_plot['2nd crime Actual'] = data_plot['2nd crime Actual'].replace([10], 'public_order_offences')
data_plot['2nd crime Actual'] = data_plot['2nd crime Actual'].replace([3], 'moral_offences')
data_plot['2nd crime Predicted'] = data_plot['2nd crime Predicted'].replace([4], 'property_offences')
data_plot['2nd crime Predicted'] = data_plot['2nd crime Predicted'].replace([10], 'public_order_offences')
data_plot['2nd crime Predicted'] = data_plot['2nd crime Predicted'].replace([3], 'moral_offences')
data_plot
| Settlement | 2nd crime Actual | 2nd crime Predicted | Pred | Test | log(SEM) | |
|---|---|---|---|---|---|---|
| 0 | Beitar- Elit | property_offences | property_offences | 192.0 | 248 | 8.05 |
| 1 | Kfar Yasif | public_order_offences | public_order_offences | 160.0 | 183 | 6.27 |
| 2 | Karmiel | public_order_offences | public_order_offences | 509.0 | 563 | 7.98 |
| 3 | Metulla | moral_offences | moral_offences | 11.0 | 16 | 3.22 |
| 4 | Natanya | property_offences | property_offences | 3778.0 | 3379 | 11.98 |
we can see that the precictions for all setellments regarding the type second most common crime is correct. But, the numbers actually predicted for each crime is not 100% accurate.
In the next plot, we can see the differences between the size of the actual crimes in 2019, and the predicted size.
#Create the charts
test_trace = go.Bar(name='Test', x=data_plot['Settlement'], y=data_plot['Test'], text=data_plot['Test'], textposition='auto',
marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6)
pred_trace = go.Bar(name='Pred', x=data_plot['Settlement'], y=data_plot['Pred'], text=data_plot['Pred'], textposition='auto',
marker_color='rgb(180,300,200)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
error_trace = go.Bar(name='Error', x=data_plot['Settlement'], y=data_plot['log(SEM)'], text=data_plot['log(SEM)'], textposition='auto',
marker_color='rgb(224,127,127)', marker_line_color='rgb(255,255,255)',marker_line_width=1, opacity=0.6)
rf_fig = plotly.subplots.make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)
rf_fig.append_trace(test_trace, 1,1)
rf_fig.append_trace(pred_trace, 1,1)
rf_fig.append_trace(error_trace, 2, 1)
# Update xaxis and yaxis properties
rf_fig.update_xaxes(title_text="Settlement", row=2, col=1)
rf_fig.update_yaxes(title_text="Second most common crime (COUNT)", row=1, col=1)
rf_fig.update_yaxes(title_text="log(MSE)", row=2, col=1)
rf_fig['layout'].update(height=600, width=950, title={'text': "Final Results Comparison- 2nd most common crime in 2019", 'y':0.9, 'x':0.5, 'xanchor': 'center','yanchor': 'top'})
rf_fig.show()
Our goal is to predict the overall crimes in 2019 in the following settelments: Keryat Ata, Rosh Pina, Eylat, Sachnin and Beir Seva. Using Adaboost algorithm
train data:
X- all features (but names and features with NaN values) of untsrgeted settelments
y- the year 2019 for the untargeted settelments
test data:
X- all features (but names and features with NaN values) of targeted settelments
y- the year 2019 for the targeted settelments
# Removing columns with NaN values:
features= crimes_TOTAL_cities.drop(['location', 'english_name', 'hebrew_name','coordinates','natural_zone','Planning Committee'], axis =1)
#splitting the data to target settlments fortraining and testing:
ada_df_test=features[(features.symbol == 6800) | (features.symbol == 26) | (features.symbol==2600 )
| (features.symbol== 9000 ) | (features.symbol==7500 )]
ada_df_train=features[(features.symbol != 6800) &(features.symbol != 26)&(features.symbol!=2600 )
& (features.symbol!= 9000 ) & (features.symbol!=7500 )]
#for training we will use untargetted settelments :
X_train=ada_df_train.drop(['2019'],1)
y_train=ada_df_train["2019"]
#for testing we will use the targeted settelments :
X_test=ada_df_test.drop(['2019'],1)
y_test=ada_df_test["2019"]
We are using grid search for finding the best parameters to sent to the model. We will print the features the model found most segnificant (using adaboost regression), and those would be the features and parameters we will use .
# Create gridsearch
ada_paramgrid = {'n_estimators':[50,100,150,200,250],
'learning_rate':[0.001, 0.01, 0.1, 1],
'random_state': [1]}
def select_features(xtrain, ytrain, xtest):
sfm = SelectFromModel(AdaBoostRegressor())
sfm.fit(xtrain, ytrain)
# Printing the names of the most important features:
print('selected features : ')
for ind in sfm.get_support(indices=True):
print(xtrain.columns[ind])
return sfm.transform(xtrain), sfm.transform(xtest)
X, x_test = select_features(X_train, y_train, X_test)
gs1 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
gs1.fit(X, y_train)
ada1 = gs1.best_estimator_
selected features : Total_crimes_all_cities_4_years 2018 2017 2016 2015 2014 population_2018 jews out of
#Predict Data:
y_pred = ada1.predict(x_test)
mse1 = mean_squared_error(y_test, y_pred)
print('MSE: ' + str(round(mse1,2)))
MSE: 162691.02
TestingDataResults=pd.DataFrame(X_test)
TestingDataResults['Actual']=y_test
round_to_whole = [round(num) for num in y_pred]
TestingDataResults['Prediction']= round_to_whole
TestingDataResults
| symbol | Total_crimes_all_cities_4_years | 2018 | 2017 | 2016 | 2015 | 2014 | district | subdistrict | Municipal status | ... | population_2018 | jews & others | jews out of | arab | shape | Organizational affiliation | height | police space | Actual | Prediction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 2600.0 | 33612 | 5410 | 5459 | 6027 | 5972 | 6063 | 6 | 62 | 0 | ... | 51935.0 | 49675.0 | 41906.0 | 2260.0 | 150 | 19 | -5 | 10001955.0 | 4681 | 4900 |
| 22 | 9000.0 | 77458 | 12486 | 13277 | 12867 | 12736 | 13284 | 6 | 62 | 0 | ... | 209002.0 | 203622.0 | 181644.0 | 5380.0 | 130 | 19 | 196 | 10001900.0 | 12808 | 12037 |
| 138 | 7500.0 | 4720 | 772 | 850 | 844 | 765 | 776 | 2 | 24 | 0 | ... | 31057.0 | 26.0 | 9.0 | 31031.0 | 260 | 19 | 178 | 10004513.0 | 713 | 449 |
| 166 | 6800.0 | 13469 | 1998 | 2280 | 2003 | 2192 | 3011 | 3 | 31 | 0 | ... | 58267.0 | 58154.0 | 53785.0 | 113.0 | 150 | 19 | 1 | 10004207.0 | 1985 | 1722 |
| 179 | 26.0 | 1241 | 207 | 208 | 232 | 202 | 216 | 2 | 21 | 99 | ... | 3120.0 | 3081.0 | 3007.0 | 39.0 | 190 | 19 | 150 | 10004540.0 | 176 | 355 |
5 rows × 21 columns
TestingDataResults['MSE'] = (TestingDataResults['Prediction']-TestingDataResults['Actual'])**2
TestingDataResults['logMSE'] = np.round(np.log(TestingDataResults['MSE']), 2)
TestingDataResults['symbol'] = TestingDataResults['symbol'].replace([2600.0], 'Keryat Ata')
TestingDataResults['symbol'] = TestingDataResults['symbol'].replace([9000.0], 'Bear Sheva')
TestingDataResults['symbol'] = TestingDataResults['symbol'].replace([7500.0], 'Sachnin')
TestingDataResults['symbol'] = TestingDataResults['symbol'].replace([26.0], 'Rosh Pina')
TestingDataResults['symbol'] = TestingDataResults['symbol'].replace([6800.0], 'Eylat')
#Create the charts
test_trace = go.Bar(name='TargetColumn', x=TestingDataResults['symbol'], y=TestingDataResults['Actual'], text=TestingDataResults['Actual'], textposition='auto',
marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6)
pred_trace = go.Bar(name='Prediction', x=TestingDataResults['symbol'], y=TestingDataResults['Prediction'], text=TestingDataResults['Prediction'], textposition='auto',
marker_color='rgb(180,300,200)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
error_trace = go.Bar(name='Error', x=TestingDataResults['symbol'], y=TestingDataResults['logMSE'], text=TestingDataResults['logMSE'], textposition='auto',
marker_color='rgb(224,127,127)', marker_line_color='rgb(255,255,255)',marker_line_width=1, opacity=0.6)
rf_fig = plotly.subplots.make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)
rf_fig.append_trace(test_trace, 1,1)
rf_fig.append_trace(pred_trace, 1,1)
rf_fig.append_trace(error_trace, 2, 1)
# Update xaxis and yaxis properties
rf_fig.update_xaxes(title_text="Settlement", row=2, col=1)
rf_fig.update_yaxes(title_text="Total crime in 2019", row=1, col=1)
rf_fig.update_yaxes(title_text="log(MSE)", row=2, col=1)
rf_fig['layout'].update(height=600, width=950, title={'text': "Final Results Comparison- Total crimes in 2019", 'y':0.9, 'x':0.5, 'xanchor': 'center','yanchor': 'top'})
rf_fig.show()
In the next task we decided to predict police strategy in every city by using RandomForestClassifier. First we calculated all necessary resources in each city, every year.
Next, for every pair of years for each city we calculated percent of growth/decrease in this resources, depending on first year in the pair. We decided that the best rate will be 20%, which in our opinion is the optimal. Because lower rate will cause many suggestions for changing amount of resources (which is not happens in our days in police every year), and higher rate will miss important changes.
Our strategy decision was calculated as follows: if amount of resources grew more than by 20% in next year – it will be labeled as 1, decrease by 20% percent will be labeled as -1, and others changes will be labeled as 0.
wh_in_cities = crimes_cities
#remove incorect total calculation from data
wh_in_cities = wh_in_cities[wh_in_cities.desc_static_group != 'Total']
# Amount of working hours per offense
security_offences = 20
financial_offences = 5
moral_offences = 15
property_offences = 10
sexual_offences = 35
fraud_offences = 25
human_offences = 40
body_offences = 30
public_order_offences = 1
licensing_offences = 1
traffic_offences = 2
years = ['2019','2018','2017','2016','2015','2014']
for year in years:
calc_work_hours = []
for index, row in wh_in_cities.iterrows():
if row['desc_static_group'] == 'security_offences' : calc_work_hours.append(security_offences * row[year])
elif row['desc_static_group'] == 'financial_offences': calc_work_hours.append(financial_offences * row[year])
elif row['desc_static_group'] == 'moral_offences': calc_work_hours.append(moral_offences * row[year])
elif row['desc_static_group'] == 'property_offences': calc_work_hours.append(property_offences * row[year])
elif row['desc_static_group'] == 'sexual_offences': calc_work_hours.append(sexual_offences * row[year])
elif row['desc_static_group'] == 'fraud_offences': calc_work_hours.append(fraud_offences * row[year])
elif row['desc_static_group'] == 'human_offences': calc_work_hours.append(human_offences * row[year])
elif row['desc_static_group'] == 'body_offences': calc_work_hours.append(body_offences * row[year])
elif row['desc_static_group'] == 'public_order_offences': calc_work_hours.append(public_order_offences * row[year])
elif row['desc_static_group'] == 'licensing_offences': calc_work_hours.append(licensing_offences * row[year])
elif row['desc_static_group'] == 'traffic_offences': calc_work_hours.append(traffic_offences * row[year])
else: calc_work_hours.append(0)
wh_in_cities['wh_'+ year] = calc_work_hours
<ipython-input-221-698b7ee0ca9b>:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#choosing dataframe with relevant features
city_strategies = wh_in_cities_totals.reset_index()
city_strategies = city_strategies[['symbol','wh_2019','wh_2018','wh_2017','wh_2016','wh_2015','wh_2014','Total_crimes_all_cities_4_years','religion', 'population_2018',
'jews & others', 'jews out of', 'arab']]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-222-4ff0a34db67e> in <module> 1 #choosing dataframe with relevant features ----> 2 city_strategies = wh_in_cities_totals.reset_index() 3 city_strategies = city_strategies[['symbol','wh_2019','wh_2018','wh_2017','wh_2016','wh_2015','wh_2014','Total_crimes_all_cities_4_years','religion', 'population_2018', 4 'jews & others', 'jews out of', 'arab']] NameError: name 'wh_in_cities_totals' is not defined
#calculating strategies for all cities in all years
percent = 0.2 # Our rate
strategies = {}
strat_2015 =[]
strat_2016 =[]
strat_2017 =[]
strat_2018 =[]
strat_2019 =[]
#######2015 ######
for index, row in city_strategies.iterrows():
if row['wh_2015'] - row['wh_2014'] > 0:
if row['wh_2015'] - row['wh_2014'] > row['wh_2014']*percent: strat_2015.append(1)
else: strat_2015.append(0)
else:
if abs(row['wh_2015'] - row['wh_2014']) > row['wh_2014']*percent: strat_2015.append(-1)
else: strat_2015.append(0)
#######2016 ######
for index, row in city_strategies.iterrows():
if row['wh_2016'] - row['wh_2015'] > 0:
if row['wh_2016'] - row['wh_2015'] > row['wh_2015']*percent: strat_2016.append(1)
else: strat_2016.append(0)
else:
if abs(row['wh_2016'] - row['wh_2015']) > row['wh_2015']*percent: strat_2016.append(-1)
else: strat_2016.append(0)
#######2017 ######
for index, row in city_strategies.iterrows():
if row['wh_2017'] - row['wh_2016'] > 0:
if row['wh_2017'] - row['wh_2016'] > row['wh_2016']*percent: strat_2017.append(1)
else: strat_2017.append(0)
else:
if abs(row['wh_2017'] - row['wh_2016']) > row['wh_2016']*percent: strat_2017.append(-1)
else: strat_2017.append(0)
#######2018 ######
for index, row in city_strategies.iterrows():
if row['wh_2018'] - row['wh_2017'] > 0:
if row['wh_2018'] - row['wh_2017'] > row['wh_2017']*percent: strat_2018.append(1)
else: strat_2018.append(0)
else:
if abs(row['wh_2018'] - row['wh_2017']) > row['wh_2017']*percent: strat_2018.append(-1)
else: strat_2018.append(0)
#######2019 ######
for index, row in city_strategies.iterrows():
if row['wh_2019'] - row['wh_2018'] > 0:
if row['wh_2019'] - row['wh_2018'] > row['wh_2018']*percent: strat_2019.append(1)
else: strat_2019.append(0)
else:
if abs(row['wh_2019'] - row['wh_2018']) > row['wh_2018']*percent: strat_2019.append(-1)
else: strat_2019.append(0)
city_strategies['strat_2015'] = strat_2015
city_strategies['strat_2016'] = strat_2016
city_strategies['strat_2017'] = strat_2017
city_strategies['strat_2018'] = strat_2018
city_strategies['strat_2019'] = strat_2019
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-223-0671f5c2c22c> in <module> 9 strat_2019 =[] 10 #######2015 ###### ---> 11 for index, row in city_strategies.iterrows(): 12 if row['wh_2015'] - row['wh_2014'] > 0: 13 if row['wh_2015'] - row['wh_2014'] > row['wh_2014']*percent: strat_2015.append(1) NameError: name 'city_strategies' is not defined
#training model
from sklearn.ensemble import RandomForestClassifier
X_train = city_strategies.drop(['strat_2018'],1)
y_train = city_strategies['strat_2018']
#testing model
X_test = city_strategies.drop(['strat_2019'],1)
y_test = city_strategies['strat_2019']
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-224-92fbdd6980c3> in <module> 2 from sklearn.ensemble import RandomForestClassifier 3 ----> 4 X_train = city_strategies.drop(['strat_2018'],1) 5 y_train = city_strategies['strat_2018'] 6 NameError: name 'city_strategies' is not defined
#optimization of the parameters
param_grid = {
'n_estimators': [100, 150, 200],#The number of trees in the forest.
'max_depth': [None, 20, 50] ,#The maximum depth of the tree.
'max_features': ['sqrt', None],#he number of features to consider when looking for the best split
'min_samples_split': [2, 5, 10],#The minimum number of samples required to split an internal node
'bootstrap': [True, False]#Whether bootstrap samples are used when building trees.
}
# choosign best features for the modle
sfm = SelectFromModel(RandomForestClassifier())
sfm.fit(X_train, y_train)
X_train = sfm.transform(X_train)
X_test = sfm.transform(X_test)
# Create a grid search object
gsRFC = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=5)
# Fit
gsRFC.fit(X_train, y_train)
gsRFC.best_params_
best_model = gsRFC.best_estimator_
ypred = best_model.predict(X_test)
confusion_matrix = pd.crosstab(np.array(city_strategies['strat_2019']),np.array(ypred), rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
from sklearn.metrics import accuracy_score
accuracy_score(np.array(city_strategies['strat_2019']), ypred)
# import pandas as pd
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #Choosing Decision Tree with 1 level as the weak learner
# DTC=DecisionTreeClassifier(max_depth=1)
# clf = AdaBoostClassifier(n_estimators=50, base_estimator=DTC ,learning_rate=1)
# #for training we will use 2018:
# X_train=ada_df_train.drop(['2019'],1)
# y_train=ada_df_train["2019"]
# #for testing: using year 2019:
# X_test=ada_df_test.drop(['2019'],1)
# y_test=ada_df_test["2019"]
# Predictors=['symbol','2018','2017','2016','2015','2014']
# Predictors=list(X_train.columns)
# #Printing all the parameters of Adaboost
# # print(clf)
# #Creating the model on Training Data
# AB=clf.fit(X_train,y_train)
# prediction=AB.predict(X_test)
# # #Measuring accuracy on Testing Data
# # from sklearn import metrics
# # print(metrics.classification_report(y_test, prediction))
# # print(metrics.confusion_matrix(y_test, prediction))
# #Plotting the feature importance for Top 10 most important columns
# %matplotlib inline
# feature_importances = pd.Series(AB.feature_importances_, index=Predictors)
# feature_importances.nlargest(10).plot(kind='barh')
# #Printing some sample values of prediction
# TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
# TestingDataResults['TargetColumn']=y_test
# TestingDataResults['Prediction']=prediction
# TestingDataResults.head()
# # summ = to_plot.sum()
# # df["district_crime"] =crimes_TOTAL_cities["district"]/all_crimes['Total_crimes_all_cities_4_years']
# new=df.groupby(['district']).sum()
# #training the algorithm to predict year 2018:
# X = data_rand_forest.drop(['2019'],1)
# y= data_rand_forest["2019"]
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# # #testing would be to predict year 2019
# # X_test = data_rand_forest.drop('2019',1)
# # y_test=data_rand_forest["2019"]
# from sklearn.cluster import KMeans
# me = np.column_stack((df_clustering['settelment_size'],df_clustering['crime_precentage'] ))
# km_res = KMeans( n_clusters = 3).fit(me)
# clusters= km_res.cluster_centers_
# plt.scatter(df_clustering['settelment_size'],df_clustering['crime_precentage'] )
# plt.scatter(clusters[:,0], clusters[:,1])
# shift_df= crimes_TOTAL_cities[["2018", 'population_2018', 'shape']]
# shift_df
# shift = np.column_stack((shift_df['2018'],shift_df['population_2018'],shift_df['shape'] ))
# from sklearn.cluster import MeanShift
# ms = MeanShift()
# ms.fit(shift)
# cluster_centers = ms.cluster_centers_
# # Finally We plot the data points
# # and centroids in a 3D graph.
# fig = plt.figure(figsize= [15, 15])
# ax = fig.add_subplot(111, projection ='3d')
# ax.scatter(shift[:, 0], shift[:, 1], shift[:, 2], marker ='o')
# ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
# cluster_centers[:, 2], marker ='x', color ='red',
# s = 300, linewidth = 5, zorder = 10)
# plt.show()
# total=crimes_TOTAL_cities.drop(['hebrew_name','coordinates','police space', 'english_name',
# 'Metropolitan affiliation','Local authority cluster','location',
# 'religion', 'shape','Organizational affiliation',
# 'district','natural_zone','Municipal status','Planning Committee'
# ,'subdistrict','height'],1)
# train= total.copy()
# #Converge the data into (x,y) dots to help plotting
# xy = PCA(n_components=2).fit_transform(train)
# #To be able to plot 3D scatterplot converge to (x,y,z) tuples aswell
# xyz = PCA(n_components=3).fit_transform(train)
# #Save in dataframes
# threeD = pd.DataFrame(xyz)
# pca_df = pd.DataFrame(xy)
# # Plot the data on a graph to see the nodes before and after clustering.
# fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.003, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
# fig.add_trace(
# go.Scatter( x=pca_df[0], y=pca_df[1],marker_symbol='hexagon2', mode="markers+text",
# marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
# row=1, col=1)
# fig.add_trace(
# go.Scatter3d(x=threeD[0], y=threeD[1], z=threeD[2],
# mode="markers",marker_symbol='circle-open', marker=dict(size=7)),
# row=1, col=2)
# fig.update_xaxes(title_text="x", row=1, col=1)
# fig.update_yaxes(title_text="y", row=1, col=1)
# fig.update_layout(height=600, width=1000,
# title_text="Visualizing the data in 2D and 3D graphs", showlegend=False)
# fig.show()
# total.describe()
# # first we need to check the optimal k so we check 15 values for k and descide the best of them acording to the sse
# sse = []
# # kmeans only deals with numerci features
# cols = list(train.columns)
# ktrain=train[cols]
# ktest=train[cols]
# k_rng = range(1,15)
# for k in k_rng:
# km = KMeans(n_clusters=k)
# km.fit(ktrain)
# sse.append(km.inertia_)
# '''we use The Elbow Method that shows the optimal k for
# the algorethim which means the number of the clusters
# that we will get, when we display the 'Sum of squared error'
# acording to each K and the connecting between the points we
# will get and elbow shape the closest k to the elbow it will be the optimal k '''
# plt.figure(figsize=(20,10))
# plt.plot(k_rng, sse , 'o-', markersize=10)
# plt.xlabel('K',fontsize = 20)
# plt.ylabel('Sum of squared error',fontsize = 20)
# plt.title('The Elbow Method showing the optimal k',fontsize = 30)
# km = KMeans(n_clusters=4) # we try k = 4
# # predict clustering for the train and for the test to
# # keep balance between the features in eache data
# y_predicted = km.fit_predict(ktrain)
# # describing the clusters in scatter plot k = 2
# clust_data = pd.DataFrame(xy) # the xy is used from the pca section 2+3
# plt.scatter(clust_data[0], clust_data[1], c=y_predicted, s=50, cmap='viridis');
# #only relevent locations:
# data_adaboost= numeric_df[(numeric_df.symbol == 6800) | (numeric_df.symbol == 26) | (numeric_df.symbol==2600 )
# | (numeric_df.symbol== 9000 ) | (numeric_df.symbol==7500 )]
# #splittiong the data to training data and testing data:
# #training for finding data about year 2018 as a training method:
# X_train=data_adaboost.drop(['2018'],1)
# y_train=data_adaboost["2018"]
# #for testing: using year 2019:
# X_test=data_adaboost.drop(['2019'],1)
# y_test=data_adaboost["2019"]
# adb = AdaBoostRegressor()
# adb_param_grid = {'n_estimators':[50,100,150,200,250], #Number of weak learners to train iteratively.,
# 'learning_rate':[0.001, 0.01, 0.1, 1], #It contributes to the weights of weak learners. It uses 1 as a default value.,
# 'random_state': [1]}
# gsADB = GridSearchCV(adb,param_grid = adb_param_grid, cv=5, scoring="accuracy", n_jobs= -1)
# gsADB.fit(X_train,y_train)
# ADB_best = gsADB.best_estimator_
# print("Best Parameters:\n", gsADB.best_params_)
# # Create gridsearch
# ada_paramgrid = {'n_estimators':[50,100,150,200,250],
# 'learning_rate':[0.001, 0.01, 0.1, 1],
# 'random_state': [1]}
# def select_features(X_train, y_train, X_test, partyname):
# '''
# Given xtrain ytrain xtest this method selects important features based on
# adaboost regressor and return the new xtrain and xtest
# '''
# sfm = AdaBoostRegressor()
# sfm.fit(X_train, y_train)
# # Printing the names of the most important features
# print('selected features in ' + partyname + ':')
# for ind in sfm.get_support(indices=True):
# print(X_train.columns[ind])
# return sfm.transform(X_train), sfm.transform(X_test)
# """Select features then fit the model, given correspoded labels"""
# X1, x1_test = select_features(X_train, y_train, X_test, '2019')
# gs1 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
# gs1.fit(X1, y1)
# ada1 = gs1.best_estimator_
# data_adaboost=data_adaboost.drop('Total_crimes_all_cities_4_years',1)
# #Separate Target Variable and Predictor Variables
# TargetVariable='2019'
# Predictors=['symbol','Total_crimes_all_cities_4_years','desc_static_group','2018','2017','2016','2015','2014','religion','population_2018','jews & others','arab','jews out of','shape','Planning Committee']
# # Predictors=['2018','2017','2016','2015','2014']
# X=data_adaboost[Predictors].values
# y=data_adaboost[TargetVariable].values
# # #Split the data into training and testing set
# # from sklearn.model_selection import train_test_split
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
# #splittiong the data to training data and testing data:
# #training for finding data about year 2018 as a training method:
# X_train=data_adaboost.drop(['2018'],1)
# y_train=data_adaboost["2018"]
# #for testing: using year 2019:
# X_test=data_adaboost.drop(['2019'],1)
# y_test=data_adaboost["2019"]
# ###### Adaboost Regression in Python #######
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.tree import DecisionTreeRegressor
# #Choosing Decision Tree with 1 level as the weak learner
# DTR=DecisionTreeRegressor(max_depth=1)
# RegModel = AdaBoostRegressor(n_estimators=100, base_estimator=DTR ,learning_rate=1)
# #Printing all the parameters of Adaboost
# print(RegModel)
# #Creating the model on Training Data
# AB=RegModel.fit(X_train,y_train)
# prediction=AB.predict(X_test)
# #Measuring Goodness of fit in Training data
# from sklearn import metrics
# print('R2 Value:',metrics.r2_score(y_train, AB.predict(X_train)))
# #Measuring accuracy on Testing Data
# print('Accuracy',100- (np.mean(np.abs((y_test - prediction) / y_test)) * 100))
# #Plotting the feature importance for Top 10 most important columns
# %matplotlib inline
# feature_importances = pd.Series(AB.feature_importances_, index=Predictors)
# feature_importances.nlargest(10).plot(kind='barh')
# #Printing some sample values of prediction
# TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
# TestingDataResults[TargetVariable]=y_test
# TestingDataResults[('Predicted'+TargetVariable)]=prediction
# TestingDataResults.head()
# data_adaboost_Ata
# import pandas as pd
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #Choosing Decision Tree with 1 level as the weak learner
# DTC=DecisionTreeClassifier(max_depth=1)
# clf = AdaBoostClassifier(n_estimators=30, base_estimator=DTC ,learning_rate=1)
# #only relevent locations:
# data_adaboost_Ata= numeric_df[(numeric_df.symbol == 6800)]
# data_adaboost_Ata= data_adaboost_Ata.drop(['religion','desc_static_group','Total_crimes_all_cities_4_years','symbol','population_2018','jews & others','arab','jews out of','shape','Planning Committee'],1)
# # data_adaboost_Ata= numeric_df[(numeric_df.symbol == 6800) | (numeric_df.symbol == 26) | (numeric_df.symbol==2600 )
# # | (numeric_df.symbol== 9000 ) | (numeric_df.symbol==7500 )]
# X_train=data_adaboost_Ata.drop(['2018'],1)
# y_train=data_adaboost_Ata["2018"]
# #for testing: using year 2019:
# X_test=data_adaboost_Ata.drop(['2019'],1)
# y_test=data_adaboost_Ata["2019"]
# # TargetVariable='2019'
# Predictors=['2018','2017','2016','2015','2014']
# # # Predictors=['2018','2017','2016','2015','2014']
# # X=data_adaboost_Ata[Predictors].values
# # y=data_adaboost_Ata[TargetVariable].values
# # #Split the data into training and testing set
# # from sklearn.model_selection import train_test_split
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=49)
# #Printing all the parameters of Adaboost
# print(clf)
# #Creating the model on Training Data
# AB=clf.fit(X_train,y_train)
# prediction=AB.predict(X_test)
# #Measuring accuracy on Testing Data
# from sklearn import metrics
# # print(metrics.classification_report(y_test, prediction))
# # print(metrics.confusion_matrix(y_test, prediction))
# #Plotting the feature importance for Top 10 most important columns
# %matplotlib inline
# feature_importances = pd.Series(AB.feature_importances_, index=Predictors)
# feature_importances.nlargest(10).plot(kind='barh')
# #Printing some sample values of prediction
# TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
# TestingDataResults['TargetColumn']=y_test
# TestingDataResults['Prediction']=prediction
# TestingDataResults.head()
# import pandas as pd
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #Choosing Decision Tree with 1 level as the weak learner
# DTC=DecisionTreeClassifier(max_depth=1)
# clf = AdaBoostClassifier(n_estimators=30, base_estimator=DTC ,learning_rate=1)
# #only relevent locations:
# data_adaboost_Rosh_Pina= numeric_df[(numeric_df.symbol == 26)]
# # data_adaboost_Ata.drop(['symbol','population_2018','jews & others','arab','jews out of','shape','Planning Committee'],1)
# # # data_adaboost_Ata= numeric_df[(numeric_df.symbol == 6800) | (numeric_df.symbol == 26) | (numeric_df.symbol==2600 )
# # | (numeric_df.symbol== 9000 ) | (numeric_df.symbol==7500 )]
# X_train=data_adaboost_Rosh_Pina.drop(['2018'],1)
# y_train=data_adaboost_Rosh_Pina["2018"]
# #for testing: using year 2019:
# X_test=data_adaboost.drop(['2019'],1)
# y_test=data_adaboost["2019"]
# #Printing all the parameters of Adaboost
# print(clf)
# #Creating the model on Training Data
# AB=clf.fit(X_train,y_train)
# prediction=AB.predict(X_test)
# #Measuring accuracy on Testing Data
# from sklearn import metrics
# # print(metrics.classification_report(y_test, prediction))
# # print(metrics.confusion_matrix(y_test, prediction))
# #Plotting the feature importance for Top 10 most important columns
# %matplotlib inline
# feature_importances = pd.Series(AB.feature_importances_, index=Predictors)
# feature_importances.nlargest(10).plot(kind='barh')
# #Printing some sample values of prediction
# TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
# TestingDataResults['TargetColumn']=y_test
# TestingDataResults['Prediction']=prediction
# TestingDataResults.head()
# mse1 = mean_squared_error(y_test, prediction)
# print('MSE of Blue-White model ' + str(round(mse1,2)))
# dd= d.drop(['symbol','Total_crimes_all_cities_4_years'],axis=1)
# X_train=dd.drop(['2018'],1)
# y_train=dd["2018"]
# #for testing: using year 2019:
# X_test=dd.drop(['2019'],1)
# y_test=dd["2019"]
# #Create the charts
# test_trace = go.Bar(name='Test', x=data_plot['Settlement'], y=data_plot['Test'], text=data_plot['Test'], textposition='auto',
# marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6)
# pred_trace = go.Bar(name='Pred', x=data_plot['Settlement'], y=data_plot['Pred'], text=data_plot['Pred'], textposition='auto',
# marker_color='rgb(180,300,200)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
# rf_fig = plotly.subplots.make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.02)
# rf_fig.append_trace(test_trace, 1,1)
# rf_fig.append_trace(pred_trace, 1,1)
# rf_fig.update_xaxes(title_text="Settlement", row=1, col=1)
# rf_fig.update_yaxes(title_text="Second most common crime size", row=1, col=1)
# rf_fig['layout'].update(height=600, width=950, title={'text': "Final Results Comparison- 2nd most common crime in 2019", 'y':0.9, 'x':0.5, 'xanchor': 'center','yanchor': 'top'})
# rf_fig.show()
# gs2 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
# gs2.fit(X, y)
# ada2 = gs2.best_estimator_
# """Predict Data for Blue White"""
# y_pred = ada2.predict(X_test)
# mse1 = mean_squared_error(y_test, y_pred)
# print('MSE of Blue-White model ' + str(round(mse1,2)))
# # """Predict Data for Joint List"""
# # y_pred2 = ada2.predict(x2_test)
# # mse2 = mean_squared_error(y_test2, y_pred2)
# # print('MSE of Joint-list model ' + str(round(mse2,2)))
# # """Predict Data for Likude"""
# # y_pred3 = ada3.predict(x3_test)
# # mse3 = mean_squared_error(y_test3, y_pred3)
# # print('MSE of Likude model ' + str(round(mse3,2)))
# import math
# # pred= math.ceil(ypred)
# pred = [math.ceil(num) for num in ypred]
# # d = {'Actual':y_test,'Predicted':pred}
# # to_g= pd.DataFrame(d)
# plt.figure(figsize=(10,10))
# plt.scatter(y_test, pred, c='crimson')
# p1 = max(max(pred), max(y_test))
# p2 = min(min(pred), min(y_test))
# plt.plot([p1, p2], [p1, p2], 'b-')
# plt.xlabel('True Values', fontsize=15)
# plt.ylabel('Predictions', fontsize=15)
# plt.axis('equal')
# plt.show()
# df_rf = df_crime.copy()
# df_rf = df_rf.drop(['location'],1)
# # df_rf = df_rf[(df_rf.symbol == (3780 or 43 or 507 or 1139 or 2640 or 7400))]
# df_rf = df_rf[(df_rf.symbol == 3780) | (df_rf.symbol == 507) | (df_rf.symbol==1139 )
# | (df_rf.symbol==43 ) | (df_rf.symbol==2640 ) | (df_rf.symbol==7400 )]
# # df_rf = df_rf[df_rf.desc_static_group != 'Total']
# gil= df_rf.drop(['symbol','Total_crimes_all_cities_4_years'],1)
# totals =gil.groupby(['desc_static_group']).agg(lambda x:x.value_counts().index[0])
# totals =df_rf.groupby(['desc_static_group','symbol']).agg(lambda x:x.value_counts().index[0])
# totals =df_rf.groupby(['desc_static_group','symbol']).agg(lambda x:x.value_counts().index[0])
# totals = df_rf.groupby('symbol').mode()
# # Note nth is 0 indexed
# second = df_rf.sort_values('n', ascending=False).groupby('symbol','desc_static_group').y.nth(1)
# ans = pd.DataFrame({'n': totals, 'second': second})
# totals.head(20)
# totals["max"] = totals[["2018", "2017","2016", "2015","2014","2019"]].max(axis=1)
# totals.head(10)
# ytrain = df_rf['Total_crimes_all_cities_4_years']
# # ytrain=pd.DataFrame(ytrain)
# # ytrain = ytrain[ytrain.desc_static_group != 'Total']
# xtrain= df_rf.drop(['Total_crimes_all_cities_4_years','symbol','desc_static_group'], 1)
# # df_rf = df_rf[df_rf.desc_static_group != 'Total']
# param_grid = {
# 'criterion':['mse'],
# 'n_estimators': [150, 180, 200, 220], # The number of trees in the forest.
# 'max_depth': [None, 40, 60, 80], # The maximum depth of the tree.
# 'max_features': ['sqrt',4,5], # he number of features to consider when looking for the best split
# 'min_samples_split': [None, 3, 4], # The minimum number of samples required to split an internal node
# 'bootstrap': [True] # Whether bootstrap samples are used when building trees.
# }
# # Create a grid search object
# gsRFC = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=-1, cv=5)
# # Fit
# gsRFC.fit(xtrain, ytrain)
# gsRFC.best_params_
# df_rf
# #group by-
# tr= df_rf(df_rf.desc_static_group!= 'Total')
# tr= tr.groupby(['symbol' ])
# tr=tr['Total_crimes_all_cities_4_years'].nlargest(2)
# tr.head(20)
# df1_transposed = totals.T # or df1.transpose()
# df1_transposed['second_common']=''
# #function to set for each district code- its name:
# def name_offences(df):
# if name == 'body_offences':
# name=1
# if name =="definition_section_offences":
# name = 2
# if name =="financial_offences":
# name = 3
# if name == "fraud_offences":
# name = 4
# if name == "human_offences":
# name= 5
# if name == 'licensing_offences':
# name = 6
# if name == "managment_offences":
# name = 7
# if name == "moral_offences":
# name = 8
# if name = "other_offences":
# name= 9
# if name == "property_offences":
# name = 10
# if name == "public_order_offences":
# name = 11
# if name == "security_offences":
# name = 12
# if name == "sexual_offences":
# name = 13
# if name == "traffic_offences":
# name == 14
# second_common = ()
# df1_transposed.drop('Total',1)
# df1_transposed["year"]=df1_transposed.rename(columns={"desc_static_group": "year"})
# df1_transposed=pd.DataFrame(df1_transposed)
# df1_transposed
# settelments=(3780, 507, 1139 , 43, 2640, 7400)
# test = df_crime[['2019', 'Total_crimes_all_cities_4_years', 'symbol']]
# ytest= df_rf['Total_crimes_all_cities_4_years'] #label
# xtest= df_rf.drop(['Total_crimes_all_cities_4_years','symbol','desc_static_group'], 1)
# # run prediction
# new_best_model = gsRFC.best_estimator_
# ypred = new_best_model.predict(xtest)
# # calulate MSE
# mse = mean_squared_error(ytest, ypred)
# print('The model\'s MSE is ' + str(round(mse, 2)))
# # Create and get relevant data only
# rf_graph = df_rf[df_rf['symbol'].isin(settelments)].copy()
# rf_graph['Pred'] = np.round(ypred)
# rf_graph = rf_graph.rename(columns={'desc_static_group': 'Settlement', 'Total_crimes_all_cities_4_years':'Test'})
# rf_graph['MSE'] = (rf_graph['Pred']-rf_graph['Test'])**2
# rf_graph['logMSE'] = np.round(np.log(rf_graph['MSE']), 2)
# #Now we'll display the results in a table; actual number of legal votes, predicted number with MSE and log(MSE)
# fig = go.Figure(data=[go.Table(
# header=dict(values=['<b>Settlement</b>', '<b>Actual</b>', '<b>Predicted</b>', '<b>MSE</b>' ,'<b>log(MSE)</b>'],
# line_color='darkslategray', fill_color='rgb(158,202,225)',
# font=dict(size=13), align='center'),
# cells=dict(values=[rf_graph['Settlement'], rf_graph['Test'], rf_graph['Pred'], rf_graph['MSE'] ,rf_graph['logMSE']],
# line_color='darkslategray', fill_color = ['rgb(222,252,229)','white'],
# align = 'center', font = dict(color = 'darkslategray', size = 13) ))
# ])
# fig.update_layout(width=800, height=315, title='Actual number of legal votes vs. predicted number of legal votes with log(MSE)')
# fig.show()
# gmm = GaussianMixture(n_components=4, random_state=42)
# plot_gmm(gmm, to_GMM)
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# crimes_TOTAL_cities
# fig, ax = plt.subplots(figsize=(9.2, 10))
# plt.barh(df_crime["desc_static_group"].unique(),df_crime["desc_static_group"].value_counts())
# df_corrolation= crimes_TOTAL_cities.drop(['english_name','2019','2018', '2017','2016','2015','2014','hebrew_name','location'],1)
# to_pca = crimes_TOTAL_cities.drop('location', 1)
# to_pca=crimes_TOTAL_cities.drop('hebrew_name', 1)
# to_pca = crimes_TOTAL_cities.drop(['location','hebrew_name', 'Local authority cluster'
# ,'Metropolitan affiliation','english_name','2019','2018','2017',
# '2016','2015','2014'], 1)
# to_pca['natural_zone'] = to_pca['natural_zone'].fillna(0)
# from sklearn.decomposition import PCA
# #Converge the data into (x,y) dots to help plotting
# xy = PCA(n_components=2).fit_transform(to_pca)
# #To be able to plot 3D scatterplot converge to (x,y,z) tuples aswell
# xyz = PCA(n_components=3).fit_transform(to_pca)
# #Save in dataframes
# threeD = pd.DataFrame(xyz)
# pca_df = pd.DataFrame(xy)
# import plotly
# import plotly.graph_objects as go
# # Plot the data on a graph to see the nodes before and after clustering.
# fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.003, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
# fig.add_trace(
# go.Scatter( x=pca_df[0], y=pca_df[1],marker_symbol='hexagon2', mode="markers+text",
# marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
# row=1, col=1)
# fig.add_trace(
# go.Scatter3d(x=threeD[0], y=threeD[1], z=threeD[2],
# mode="markers",marker_symbol='circle-open', marker=dict(size=7)),
# row=1, col=2)
# fig.update_xaxes(title_text="x", row=1, col=1)
# fig.update_yaxes(title_text="y", row=1, col=1)
# fig.update_layout(height=600, width=1000,
# title_text="Visualizing the data in 2D and 3D graphs", showlegend=False)
# fig.show()
# # these graphs dont realy gives somthing interesting
# #Importing required modules
# from sklearn.datasets import load_digits
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans
# import numpy as np
# #
# #Load Data
# pca = PCA(2)
# #Transform the data
# df = pca.fit_transform(to_pca)
# #Import required module
# from sklearn.cluster import KMeans
# #Initialize the class object
# kmeans = KMeans(n_clusters= 10)
# #predict the labels of clusters.
# label = kmeans.fit_predict(to_pca)
# print(label)
# import matplotlib.pyplot as plt
# #filter rows of original data
# filtered_label0 = df[label == 0]
# #plotting the results
# plt.scatter(filtered_label0[:,0] , filtered_label0[:,1])
# plt.show()
# #filter rows of original data
# filtered_label2 = df[label == 2]
# filtered_label8 = df[label == 8]
# #Plotting the results
# plt.scatter(filtered_label2[:,0] , filtered_label2[:,1] , color = 'red')
# plt.scatter(filtered_label8[:,0] , filtered_label8[:,1] , color = 'black')
# plt.show()
# #Getting unique labels
# u_labels = np.unique(label)
# #plotting the results:
# for i in u_labels:
# plt.scatter(to_pca[label == i , 0] , to_pca[label == i , 1] , label = i)
# plt.legend()
# plt.show()
# def elbow_met(df):
# from scipy.spatial.distance import cdist
# # k means determine k
# distortions = []
# K = range(1,10)
# for k in K:
# kmeanModel = KMeans(n_clusters=k).fit(to_pca)
# kmeanModel.fit(to_pca)
# distortions.append(sum(np.min(cdist(to_pca, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / df.shape[0])
# # Plot the elbow
# plt.plot(K, distortions, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Within groups sum of squares')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()
# from sklearn.cluster import KMeans
# elbow_met(to_pca)
# kmeans = KMeans(n_clusters=5).fit(to_pca)
# centroids = kmeans.cluster_centers_
# print(centroids)
# df=pd.DataFrame(df)
# plt.scatter(df[0], df[1], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
# plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
# plt.show()
# conda install -c plotly plotly
# import plotly.offline as py
# import plotly.express as px
# import plotly.graph_objects as go
# import plotly
# import re
# # Plot the data on a graph to see the nodes before and after clustering.
# fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.03, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
# fig.add_trace(
# go.Scatter( x=df[0], y=df[1],marker_symbol='hexagon2', mode="markers+text",
# marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
# row=1, col=1)
# fig.add_trace(
# go.Scatter( x=df[0], y=df[1],marker_symbol='hexagon2', mode="markers+text",
# marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
# row=1, col=1)
# fig.update_xaxes(title_text="x", row=1, col=1)
# fig.update_yaxes(title_text="y", row=1, col=1)
# fig.update_layout(height=600, width=1000,
# title_text="Plotting data after lowering the it's dimention, once in 2D the other 3D", showlegend=False)
# fig.show()
together.to_excel("f.xlsx")
# Total_all_row = df_crime.loc[0]
# Total_all_row
# conda install -c anaconda basemap
# # 1. Draw the map background
# fig = plt.figure(figsize=(8, 8))
# m = Basemap(projection='lcc', resolution='h',
# lat_0=37.5, lon_0=-119,
# width=1E6, height=1.2E6)
# m.shadedrelief()
# m.drawcoastlines(color='gray')
# m.drawcountries(color='gray')
# m.drawstates(color='gray')
# # 2. scatter city data, with color reflecting population
# # and size reflecting area
# m.scatter(lon, lat, latlon=True,
# c=np.log10(population), s=area,
# cmap='Reds', alpha=0.5)
# # 3. create colorbar and legend
# plt.colorbar(label=r'$\log_{10}({\rm population})$')
# plt.clim(3, 7)
# # make legend with dummy points
# for a in [100, 300, 500]:
# plt.scatter([], [], c='k', alpha=0.5, s=a,
# label=str(a) + ' km$^2$')
# plt.legend(scatterpoints=1, frameon=False,
# labelspacing=1, loc='lower left');
# import seaborn as sns
# # Plot
# plt.figure(figsize=(12,10), dpi= 80)
# sns.heatmap(df_corrolatino.corr(), xticklabels=df_corrolatino.corr().columns, yticklabels=df_corrolatino.corr().columns, cmap='RdYlGn', center=0, annot=True)
# # Decorations
# plt.title('Correlogram of mtcars', fontsize=22)
# plt.xticks(fontsize=12)
# plt.yticks(fontsize=12)
# plt.show()
# # Draw Plot
# import scipy.cluster.hierarchy as shc
# gil_crimes_cities = crimes_cities.replace(['-'], 0 )
# # Plot
# plt.figure(figsize=(16, 10), dpi= 80)
# plt.title("USArrests Dendograms", fontsize=22)
# dend = shc.dendrogram(shc.linkage(gil_crimes_cities[['Total_crimes_all_cities_4_years']], method='ward'), labels=gil_crimes_cities.english_name.values, color_threshold=100)
# plt.xticks(fontsize=12)
# plt.show()
# crimes_cities
X_train.to_excel("c.xlsx")
# from sklearn.datasets.samples_generator import make_blobs
# X, y_true = make_blobs(n_samples=300, centers=4,
# cluster_std=0.60, random_state=0)
# plt.scatter(X[:, 0], X[:, 1], s=50);
# def elbow_met(df):
# from scipy.spatial.distance import cdist
# # k means determine k
# distortions = []
# K = range(1,10)
# for k in K:
# kmeanModel = KMeans(n_clusters=k).fit(df)
# kmeanModel.fit(df)
# distortions.append(sum(np.min(cdist(df, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / df.shape[0])
# # Plot the elbow
# plt.plot(K, distortions, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Within groups sum of squares')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()